Udemy.com is an online learning platform with more than 100,000 courses and over 30 million students worldwide. The platform offers courses in many categories, e.g. Business, Design, or Marketing. With so many options available, it is hard to choose the right course, since everyone has different tastes. A recommender system helps students choose their next course without spending hours reading course descriptions. It not only saves the user time but also helps them find something interesting based on their previous course choices.

Setup

!pip install squarify
Requirement already satisfied: squarify in /usr/local/lib/python3.7/dist-packages (0.4.3)
import pandas as pd
import numpy as np

import scipy.stats as st

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

import requests
import os

import requests
import ast
import pickle
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from operator import itemgetter
from collections import Counter
import matplotlib
import squarify
from sklearn.cluster import KMeans
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import euclidean_distances
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np
import datetime
import scipy.stats as st
import ast
import re 
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

Data Import

Import the courses

Import the Business courses from the Udemy API (the limit is 10,000 courses).

Tip: You can request Affiliate API access on Udemy’s site. For me, approval took 7 hours.
import time

# Download every Business course from the Udemy API and pickle the raw
# results to ./courses_<last page>.txt in chunks of PAGES_PER_CHUNK pages.
# `get_data` is defined elsewhere in the notebook; it is called with the page
# URL and the API credentials, and the 'next' / 'results' keys of its return
# value are read below.

# NOTE(review): the client id is committed in plain text while the secret comes
# from the environment -- consider moving the id to the environment as well.
username = 'oZAbbHY1iopJmQlRBmUOvepJVAmEadBfLARYO42N'
pw = os.environ['udemy_client_secret']

PAGES_PER_CHUNK = 10  # pages buffered in memory before a chunk file is written
MAX_RETRIES = 5       # consecutive failures tolerated for a single page
COURSES_URL = 'https://www.udemy.com/api-2.0/courses/?fields[course]=@all&page={page}&category=Business'

list_json = []
global_counter = 0  # number of pages already persisted in complete chunks
local_counter = 0   # pages fetched into the current, not yet written chunk

# Resume support: a complete chunk courses_<n>.txt holds pages n-9..n, so every
# chunk already on disk lets us skip PAGES_PER_CHUNK pages of the listing.
while os.path.exists(f"./courses_{global_counter + PAGES_PER_CHUNK}.txt"):
    global_counter += PAGES_PER_CHUNK
    print("Stored {} results!".format(global_counter))

# Continue with the first page that is not on disk yet (page 1 on a fresh run).
url = COURSES_URL.format(page=global_counter + 1)
failures = 0
completed = True

while url is not None:
    try:
        data_json = get_data(url, username, pw)
        next_url = data_json['next']
        results = data_json['results']
    except Exception as exc:
        # Retry the same page a bounded number of times instead of looping
        # forever on a persistent error (bad credentials, rate limit, ...).
        failures += 1
        print(f"Request failed after {global_counter} stored pages ({failures}/{MAX_RETRIES}): {exc!r}")
        if failures >= MAX_RETRIES:
            completed = False
            break
        time.sleep(failures)
        continue

    failures = 0
    url = next_url
    list_json.extend(results)
    # Count the page only once it has actually been fetched, so that every
    # complete chunk really contains PAGES_PER_CHUNK pages.
    local_counter += 1
    if local_counter % PAGES_PER_CHUNK == 0:
        local_counter = 0
        global_counter += PAGES_PER_CHUNK
        with open(f"./courses_{global_counter}.txt", "wb") as fp:
            pickle.dump(list_json, fp)
        list_json = []
        print("Stored {} results!".format(global_counter))

# The last chunk usually has fewer than PAGES_PER_CHUNK pages; write it as well.
# It is only written after a clean finish so that an aborted run never leaves a
# partial file that a resumed run would duplicate.
if completed and list_json:
    with open(f"./courses_{global_counter + local_counter}.txt", "wb") as fp:
        pickle.dump(list_json, fp)
    print("Stored {} results!".format(global_counter + local_counter))
    list_json = []

image.png

import glob
import re


def chunk_index(path):
    """Return the page counter encoded in a ``courses_<n>.txt`` file name.

    Paths that do not follow the naming scheme yield -1 so they sort first.
    """
    match = re.search(r"courses_(\d+)\.txt$", path)
    return int(match.group(1)) if match else -1


# Read only the course chunks: the review download writes reviews_<n>.txt into
# the same directory, and a plain "*.txt" pattern would mix both data sets.
# glob returns files in arbitrary order, so sort numerically to keep the row
# order (and therefore any later "keep first" de-duplication) reproducible.
all_chunks = sorted(glob.glob("./courses_*.txt"), key=chunk_index)

list_json = []

# NOTE: pickle.load can execute arbitrary code -- only load chunk files that
# were produced by this notebook.
for chunk in all_chunks:
    with open(chunk, "rb") as fp:
        list_json.extend(pickle.load(fp))
len(list_json)
11160
# Turn the list of raw course records into a table (one row per record),
# keep a CSV snapshot on disk and preview the first rows.
df_courses = pd.DataFrame.from_dict(data=list_json)
df_courses.to_csv(path_or_buf='df_courses.csv')
df_courses.head(n=5)
_class id title url is_paid price price_detail price_serve_tracking_id visible_instructors image_125_H image_240x135 is_practice_test_course image_480x270 published_title tracking_id description headline num_subscribers caption_locales discount discount_price avg_rating avg_rating_recent rating num_reviews num_reviews_recent rating_distribution favorite_time archive_time earnings completion_ratio is_wishlisted is_saved num_quizzes num_lectures num_published_lectures num_published_quizzes num_curriculum_items num_of_published_curriculum_objects num_cpe_credits ... prerequisites objectives objectives_summary target_audiences last_accessed_time enrollment_time course_has_labels bestseller_badge_content badges free_course_subscribe_url is_recently_published last_update_date num_article_assets num_coding_exercises num_assignments num_additional_assets preview_url landing_preview_as_guest_url context_info has_sufficient_preview_length has_org_only_setting is_draft common_review_attributes subscription_locale custom_category_ids alternate_redirect_course_id is_approved is_organization_eligible instructor_status available_features enroll_url learn_url predictive_score relevancy_score input_features lecture_search_result curriculum_lectures order_in_results curriculum_items instructor_name
0 course 171838 Secret Sauce of Great Writing /course/secret-sauce-of-great-writing/ False Free None cUyKLjR5SyaGS7yPGXyBYg [{'_class': 'user', 'title': 'Shani Raja', 'na... https://img-c.udemycdn.com/course/125_H/171838... https://img-c.udemycdn.com/course/240x135/1718... False https://img-c.udemycdn.com/course/480x270/1718... secret-sauce-of-great-writing uVW9mYxVQrKKxgsWkhKARA <p><strong><em>SECRET&nbsp;SAUCE&nbsp;OF&nbsp;... Ex-Wall Street Journal Editor Teaches How To M... 187239 [{'_class': 'locale', 'locale': 'en_US', 'titl... None None 4.483728 4.485416 4.485416 13806 339 [{'count': 125, 'rating': 1}, {'count': 271, '... None None $0.00 0 False False 0 17 15 0 17 15 NaN ... [A computer for doing writing and editing exer... [In this course, Shani Raja, a former Wall Str... [In this course, Shani Raja, a former Wall Str... [The course will interest professionals keen t... None None [{'_class': 'course_has_label', 'id': 47066, '... None [] /course/subscribe/?courseId=171838 False None 0 0 0 1 /course/171838/preview/ /course/secret-sauce-of-great-writing/?instruc... {'category': {'title': 'Business', 'url': '/co... True False False [{'num_votes': 533, 'id': 42, 'applies_to': 'i... None [] 0 True True {'_class': 'instructor_course_status', 'id': 3... [] /course/171838/enroll/ /course/secret-sauce-of-great-writing/learn/ None None None None [] None [] None
1 course 780078 How to speak to anyone & be fearless - in less... /course/fearless-public-speaker-bootcamp-by-ri... False Free None 7PfKpSZFR5OJBxxGdxmlVw [{'_class': 'user', 'title': 'Ricardo Mendoza'... https://img-c.udemycdn.com/course/125_H/780078... https://img-c.udemycdn.com/course/240x135/7800... False https://img-c.udemycdn.com/course/480x270/7800... fearless-public-speaker-bootcamp-by-ricky-mendoza q5HZyGmtQY-ilQWHcnv08w <p><strong>The goals of this course are simple... Learn to quickly connect with crowds of strang... 129594 [{'_class': 'locale', 'locale': 'en_US', 'titl... None None 4.452830 4.460957 4.460957 8198 381 [{'count': 132, 'rating': 1}, {'count': 248, '... None None $0.00 0 False False 0 23 23 0 23 23 NaN ... [Not much! :) If you communicate with anyone, ... [Speak fearlessly to a crowd (or face to face)... [Speak fearlessly to a crowd (or face to face)... [This course is meant for anyone wanting to co... None None [{'_class': 'course_has_label', 'id': 180108, ... None [] /course/subscribe/?courseId=780078 False None 0 0 0 0 /course/780078/preview/ /course/fearless-public-speaker-bootcamp-by-ri... {'category': {'title': 'Business', 'url': '/co... True False False [{'num_votes': 77, 'id': 42, 'applies_to': 'in... None [] 0 True True None [] /course/780078/enroll/ /course/fearless-public-speaker-bootcamp-by-ri... None None None None [] None [] None
2 course 352440 Agile PM 101 - Learn the Truth About Agile ver... /course/learn-the-truth-about-agile-versus-wat... False Free None vaPJIvlbQi62V9KnDNj2yQ [{'_class': 'user', 'title': 'Agile Project Ma... https://img-c.udemycdn.com/course/125_H/352440... https://img-c.udemycdn.com/course/240x135/3524... False https://img-c.udemycdn.com/course/480x270/3524... learn-the-truth-about-agile-versus-waterfall Tq8tmHZZT5WBKZNMaXFXzQ <p><strong>Important:</strong><br>This course ... Learn to See Agile and Waterfall in a Fresh, N... 107450 [] None None 4.509386 4.539074 4.539074 14781 815 [{'count': 47, 'rating': 1}, {'count': 191, 'r... None None $0.00 0 False False 7 13 13 7 20 20 NaN ... [None] [Learn the truth about the relationship of "Ag... [Learn the truth about the relationship of "Ag... [Executives and Business Managers who want to ... None None [{'_class': 'course_has_label', 'id': 22478, '... None [] /course/subscribe/?courseId=352440 False 2020-04-09 2 0 0 19 /course/352440/preview/ /course/learn-the-truth-about-agile-versus-wat... {'category': {'title': 'Business', 'url': '/co... True False False [{'num_votes': 458, 'id': 42, 'applies_to': 'i... None [] 0 True True None [] /course/352440/enroll/ /course/learn-the-truth-about-agile-versus-wat... None None None None [] None [] None
3 course 576504 Learn Power BI Basics for Free /course/learn_power_bi_for_free/ False Free None R0N2ANN2Sq-eY9AGldxSTQ [{'_class': 'user', 'title': 'Vishal Pawar', '... https://img-c.udemycdn.com/course/125_H/576504... https://img-c.udemycdn.com/course/240x135/5765... False https://img-c.udemycdn.com/course/480x270/5765... learn_power_bi_for_free SF3YC3o3SvGLXLWJ3x1gkg <p style="">Learn <strong style="">Power BI</s... In this tutorial you will be learning all basi... 93112 [{'_class': 'locale', 'locale': 'en_US', 'titl... None None 4.117857 4.070692 4.070692 4246 141 [{'count': 276, 'rating': 1}, {'count': 306, '... None None $0.00 0 False False 0 21 20 0 21 20 NaN ... [You should know terms Query, Excel :)] [Learn all basics of Power BI, Learn Dashboard... [Learn all basics of Power BI, Learn Dashboard... [Everyone who play with Data, Developer, IT P... None None [{'_class': 'course_has_label', 'id': 108500, ... None [] /course/subscribe/?courseId=576504 False None 0 0 0 6 /course/576504/preview/ /course/learn_power_bi_for_free/?instructorPre... {'category': {'title': 'Business', 'url': '/co... True False False [{'num_votes': 24, 'id': 28, 'applies_to': 'co... None [] 0 True True None [] /course/576504/enroll/ /course/learn_power_bi_for_free/learn/ None None None None [] None [] None
4 course 81534 Make Money from Home: How to Build an Online B... /course/how-to-build-an-online-business-a-comp... False Free None aoVyKdX0RIC190Iz_WqJxQ [{'_class': 'user', 'title': 'mike omar', 'nam... https://img-c.udemycdn.com/course/125_H/81534_... https://img-c.udemycdn.com/course/240x135/8153... False https://img-c.udemycdn.com/course/480x270/8153... how-to-build-an-online-business-a-complete-bus... rUWVrrbART6k9w2NZX-qEw <p> \t HOW TO MAKE MONE... Learn all the basics of online entrepreneurshi... 221348 [{'_class': 'locale', 'locale': 'en_US', 'titl... None None 4.490196 4.493055 4.493055 5432 46 [{'count': 93, 'rating': 1}, {'count': 131, 'r... None None $0.00 0 False False 0 24 24 0 24 24 NaN ... [None.] [To build a $5,000 / month passive income webs... [To build a $5,000 / month passive income webs... [Intended for anyone interested in making mone... None None [{'_class': 'course_has_label', 'id': 81066, '... None [] /course/subscribe/?courseId=81534 False None 2 0 0 0 /course/81534/preview/ /course/how-to-build-an-online-business-a-comp... {'category': {'title': 'Business', 'url': '/co... True False False [{'num_votes': 92, 'id': 46, 'applies_to': 'in... None [] 0 True True None [] /course/81534/enroll/ /course/how-to-build-an-online-business-a-comp... None None None None [] None [] None

5 rows × 150 columns

# Summary statistics (count / unique / top / freq) for the object-typed columns.
df_courses.describe(include=[object])
_class title url price price_detail price_serve_tracking_id visible_instructors image_125_H image_240x135 image_480x270 published_title tracking_id description headline caption_locales discount discount_price rating_distribution favorite_time archive_time earnings original_price_text quality_status status_label features gift_url notification_settings intended_category image_48x27 image_50x50 image_75x75 image_96x54 image_100x100 image_200_H image_304x171 image_750x422 primary_category primary_subcategory locale caption_languages ... apple_in_app_purchase_price_text type_label google_in_app_price_detail apple_in_app_price_detail client_settings quality_review_process cpe_field_of_study cpe_program_level buyable_object_type published_time checkout_url prerequisites objectives objectives_summary target_audiences last_accessed_time enrollment_time course_has_labels bestseller_badge_content badges free_course_subscribe_url last_update_date preview_url landing_preview_as_guest_url context_info common_review_attributes subscription_locale custom_category_ids instructor_status available_features enroll_url learn_url predictive_score relevancy_score input_features lecture_search_result curriculum_lectures order_in_results curriculum_items instructor_name
count 11160 11160 11160 11160 10216 11160 11160 11160 11160 11160 11160 11160 11160 11160 11160 9389 9389 11160 0 0 11160 11160 11160 11160 11160 11160 11160 7376 11160 11160 11160 11160 11160 11160 11160 11160 11160 11160 11160 11160 ... 10216 11160 10216 10216 11160 11160 16 16 11160 11160 11160 11160 11160 11160 11160 0 0 11160 441 11160 944 8651 10859 11160 11160 11160 0 11160 2091 11160 11160 11160 0 0 0 0 11160 0 11160 0
unique 1 7555 7568 30 29 9784 4663 7568 7568 7568 7568 11160 7560 7479 182 8235 15 4848 0 0 1 195 1 1 5 7568 1 14 7568 7568 7568 7568 7568 7568 7568 7568 1 15 3 79 ... 42 1 42 42 5 6863 3 1 1 7565 7568 6212 7510 7499 7219 0 0 7566 302 861 647 1852 7360 7568 1018 345 0 1 1403 3 7568 7568 0 0 0 0 1 0 1 0
top course Practical Project Management for Management Co... /course/management-consulting-project-d/ $19.99 {'amount': 19.99, 'currency': 'USD', 'price_st... UO5Fj9atSTGHDnztiLQzOA [{'_class': 'user', 'title': 'Sorin Dumitrascu... https://img-c.udemycdn.com/course/125_H/125835... https://img-c.udemycdn.com/course/240x135/1258... https://img-c.udemycdn.com/course/480x270/1258... management-consulting-project-d I1X2GPuvQfiteOQFZpRy2g <p><strong>What is the aim of this course?</st... Project Management Professional Certification ... [{'_class': 'locale', 'locale': 'en_US', 'titl... {'price_serve_tracking_id': '0CAqYbbaQheReCXyz... {'amount': 13.0, 'currency': 'USD', 'price_str... [{'rating': 1, 'count': 0}, {'rating': 2, 'cou... NaN NaN $0.00 $19.99 approved Live {'_class': 'course', 'discussions_create': Tru... https://www.udemy.com/gift/management-consulti... {} {'_class': 'course_category', 'id': 268, 'titl... https://img-c.udemycdn.com/course/48x27/125835... https://img-c.udemycdn.com/course/50x50/125835... https://img-c.udemycdn.com/course/75x75/125835... https://img-c.udemycdn.com/course/96x54/125835... https://img-c.udemycdn.com/course/100x100/1258... https://img-c.udemycdn.com/course/200_H/125835... https://img-c.udemycdn.com/course/304x171/1258... https://img-c.udemycdn.com/course/750x422/1258... {'_class': 'course_category', 'id': 268, 'titl... {'_class': 'course_subcategory', 'id': 26, 'ti... {'_class': 'locale', 'locale': 'en_US', 'title... [English [Auto]] ... $12.99 course {'amount': 12.99, 'currency': 'USD', 'price_st... {'amount': 12.99, 'currency': 'USD', 'price_st... {'machine_cc_enabled': False, 'cpe_program_lev... {'status': None, 'score': None, 'admin_rating'... Communications & Marketing - Non-Technical Basic course 2017-09-12T16:09:11Z /cart/checkout/express/course/1258358/?discoun... [None] [] [] [] NaN NaN [{'_class': 'course_has_label', 'id': 220214, ... {'_class': 'ds_course_badge', 'id': 'e7bdb5303... 
[] /course/subscribe/?courseId=1303808 2021-07-01 /course/1258358/preview/ /course/management-consulting-project-d/?instr... {'category': {'id': 268, 'tracking_object_type... [] NaN [] {'_class': 'instructor_course_status', 'id': 4... [q_and_a_enabled, certificate] /course/1258358/enroll/ /course/management-consulting-project-d/learn/ NaN NaN NaN NaN [] NaN [] NaN
freq 11160 6 6 2546 2546 4 79 6 6 6 6 1 6 23 8573 4 7948 1351 NaN NaN 11160 2186 11160 11160 10729 6 11160 5384 6 6 6 6 6 6 6 6 11160 2313 10084 8583 ... 7948 11160 7948 7948 11105 1031 9 16 11160 6 6 322 27 27 32 NaN NaN 6 5 9912 4 72 6 6 383 10515 NaN 11160 5 9823 6 6 NaN NaN NaN NaN 11160 NaN 11160 NaN

4 rows × 94 columns

Import the reviews

For these courses, I downloaded the available reviews. At most 10,000 reviews are available per course.

# Download the reviews of every course, pickling the raw results to
# reviews_<page count>.txt every REVIEW_PAGES_PER_CHUNK pages, and build
# df_review with one row per review.
# `get_data`, `username` and `pw` come from earlier cells of the notebook.

REVIEW_PAGES_PER_CHUNK = 100  # pages buffered before a chunk file is written
MAX_REVIEW_RETRIES = 5        # consecutive failures tolerated for a single page

local_counter = 0   # pages fetched into the current, not yet written chunk
global_counter = 0  # pages already persisted in complete chunks

chunk_buffer = []   # reviews waiting to be written to the next chunk file
review_frames = []  # one DataFrame per course, concatenated once at the end

# df_courses contains repeated course ids; fetch each course only once.
for id_ in df_courses['id'].drop_duplicates().values:
    url = "https://www.udemy.com/api-2.0/courses/{}/reviews/?page=1&page_size=100".format(id_)
    list_json_review = []
    failures = 0
    while url is not None:
        try:
            data_json = get_data(url, username, pw)
            next_url = data_json['next']
            results = data_json['results']
        except Exception as exc:
            # Bounded retry: a persistent error must not loop forever.
            failures += 1
            if failures >= MAX_REVIEW_RETRIES:
                print(f"Skipping remaining reviews of course {id_}: {exc!r}")
                break
            continue

        failures = 0
        url = next_url
        # The per-course list feeds df_review; the chunk buffer is independent
        # of it, so writing a chunk no longer empties the course's reviews.
        list_json_review.extend(results)
        chunk_buffer.extend(results)
        local_counter += 1
        if local_counter % REVIEW_PAGES_PER_CHUNK == 0:
            local_counter = 0
            global_counter += REVIEW_PAGES_PER_CHUNK
            with open(f"reviews_{global_counter}.txt", "wb") as fp:
                pickle.dump(chunk_buffer, fp)
            chunk_buffer = []
            print("Stored {} results!".format(global_counter))

    df_review_unique = pd.DataFrame.from_dict(list_json_review)
    # NOTE(review): as in the original cell, 'id' is set to the course id and
    # therefore replaces a review-level 'id' column if the API returns one.
    # The pickled chunks do not carry the course id at all -- confirm that the
    # review payload identifies its course before relying on the chunk files.
    df_review_unique['id'] = id_
    review_frames.append(df_review_unique)

# Write whatever is left in the buffer (fewer than REVIEW_PAGES_PER_CHUNK pages).
if chunk_buffer:
    with open(f"reviews_{global_counter + local_counter}.txt", "wb") as fp:
        pickle.dump(chunk_buffer, fp)
    print("Stored {} results!".format(global_counter + local_counter))
    chunk_buffer = []

# A single concat avoids re-copying the growing frame once per course.
df_review = pd.concat(review_frames) if review_frames else pd.DataFrame()

image.png

import glob

# Collect every pickled review chunk from the Udemyreviews folder.
all_chunks = glob.glob("./Udemyreviews/*.txt")

# Each chunk file holds a pickled collection of review records; append them all
# to one flat list.
list_json_review = []
for chunk_path in all_chunks:
    with open(chunk_path, "rb") as chunk_file:
        chunk_records = pickle.load(chunk_file)
    list_json_review.extend(chunk_records)

# One row per review record, plus a CSV snapshot on disk.
df_review = pd.DataFrame.from_dict(data=list_json_review)
df_review.to_csv(path_or_buf='df_review.csv')

Data Persistence

We will store these two datasets in compressed Parquet format for quick and efficient loading in the future.

# Persist both tables as gzip-compressed Parquet files.
parquet_targets = (
    (df_courses, 'udemy_courses.parquet.gzip'),
    (df_review, 'udemy_reviews.parquet.gzip'),
)
for frame, target_path in parquet_targets:
    frame.to_parquet(target_path, compression='gzip')

Data Cleaning

During data cleaning, I performed the following operations on the raw dataset:

  1. import the raw data
  2. transform the relevant columns
  3. filter the dataset
  4. keep only the relevant columns
  5. drop the duplicates
  6. treat the missing values
  7. save the cleaned data

Cleaning the course data

# Load the raw course table from the published Parquet snapshot.
COURSES_PARQUET_URL = "https://github.com/sparsh-ai/reco-data/raw/master/udemy/udemy_courses.parquet.gzip"
df_courses = pd.read_parquet(COURSES_PARQUET_URL)
df_courses.shape
(11160, 150)
# Preview the first five rows of the raw course table.
df_courses.head(n=5)
_class id title url is_paid price price_detail price_serve_tracking_id visible_instructors image_125_H image_240x135 is_practice_test_course image_480x270 published_title tracking_id description headline num_subscribers caption_locales discount discount_price avg_rating avg_rating_recent rating num_reviews num_reviews_recent rating_distribution favorite_time archive_time earnings completion_ratio is_wishlisted is_saved num_quizzes num_lectures num_published_lectures num_published_quizzes num_curriculum_items num_of_published_curriculum_objects num_cpe_credits ... prerequisites objectives objectives_summary target_audiences last_accessed_time enrollment_time course_has_labels bestseller_badge_content badges free_course_subscribe_url is_recently_published last_update_date num_article_assets num_coding_exercises num_assignments num_additional_assets preview_url landing_preview_as_guest_url context_info has_sufficient_preview_length has_org_only_setting is_draft common_review_attributes subscription_locale custom_category_ids alternate_redirect_course_id is_approved is_organization_eligible instructor_status available_features enroll_url learn_url predictive_score relevancy_score input_features lecture_search_result curriculum_lectures order_in_results curriculum_items instructor_name
0 course 171838 Secret Sauce of Great Writing /course/secret-sauce-of-great-writing/ False Free None cUyKLjR5SyaGS7yPGXyBYg [{'_class': 'user', 'title': 'Shani Raja', 'na... https://img-c.udemycdn.com/course/125_H/171838... https://img-c.udemycdn.com/course/240x135/1718... False https://img-c.udemycdn.com/course/480x270/1718... secret-sauce-of-great-writing uVW9mYxVQrKKxgsWkhKARA <p><strong><em>SECRET&nbsp;SAUCE&nbsp;OF&nbsp;... Ex-Wall Street Journal Editor Teaches How To M... 187239 [{'_class': 'locale', 'locale': 'en_US', 'titl... None None 4.483728 4.485416 4.485416 13806 339 [{'count': 125, 'rating': 1}, {'count': 271, '... NaN NaN $0.00 0 False False 0 17 15 0 17 15 NaN ... ['A computer for doing writing and editing exe... ['In this course, Shani Raja, a former Wall St... ['In this course, Shani Raja, a former Wall St... ['The course will interest professionals keen ... NaN NaN [{'_class': 'course_has_label', 'id': 47066, '... None [] /course/subscribe/?courseId=171838 False None 0 0 0 1 /course/171838/preview/ /course/secret-sauce-of-great-writing/?instruc... {'category': {'title': 'Business', 'url': '/co... True False False [{'num_votes': 533, 'id': 42, 'applies_to': 'i... NaN [] 0 True True {'_class': 'instructor_course_status', 'id': 3... [] /course/171838/enroll/ /course/secret-sauce-of-great-writing/learn/ NaN NaN NaN NaN [] NaN [] NaN
1 course 780078 How to speak to anyone & be fearless - in less... /course/fearless-public-speaker-bootcamp-by-ri... False Free None 7PfKpSZFR5OJBxxGdxmlVw [{'_class': 'user', 'title': 'Ricardo Mendoza'... https://img-c.udemycdn.com/course/125_H/780078... https://img-c.udemycdn.com/course/240x135/7800... False https://img-c.udemycdn.com/course/480x270/7800... fearless-public-speaker-bootcamp-by-ricky-mendoza q5HZyGmtQY-ilQWHcnv08w <p><strong>The goals of this course are simple... Learn to quickly connect with crowds of strang... 129594 [{'_class': 'locale', 'locale': 'en_US', 'titl... None None 4.452830 4.460957 4.460957 8198 381 [{'count': 132, 'rating': 1}, {'count': 248, '... NaN NaN $0.00 0 False False 0 23 23 0 23 23 NaN ... ["Not much! :) If you communicate with anyone,... ['Speak fearlessly to a crowd (or face to face... ['Speak fearlessly to a crowd (or face to face... ['This course is meant for anyone wanting to c... NaN NaN [{'_class': 'course_has_label', 'id': 180108, ... None [] /course/subscribe/?courseId=780078 False None 0 0 0 0 /course/780078/preview/ /course/fearless-public-speaker-bootcamp-by-ri... {'category': {'title': 'Business', 'url': '/co... True False False [{'num_votes': 77, 'id': 42, 'applies_to': 'in... NaN [] 0 True True None [] /course/780078/enroll/ /course/fearless-public-speaker-bootcamp-by-ri... NaN NaN NaN NaN [] NaN [] NaN
2 course 352440 Agile PM 101 - Learn the Truth About Agile ver... /course/learn-the-truth-about-agile-versus-wat... False Free None vaPJIvlbQi62V9KnDNj2yQ [{'_class': 'user', 'title': 'Agile Project Ma... https://img-c.udemycdn.com/course/125_H/352440... https://img-c.udemycdn.com/course/240x135/3524... False https://img-c.udemycdn.com/course/480x270/3524... learn-the-truth-about-agile-versus-waterfall Tq8tmHZZT5WBKZNMaXFXzQ <p><strong>Important:</strong><br>This course ... Learn to See Agile and Waterfall in a Fresh, N... 107450 [] None None 4.509386 4.539074 4.539074 14781 815 [{'count': 47, 'rating': 1}, {'count': 191, 'r... NaN NaN $0.00 0 False False 7 13 13 7 20 20 NaN ... ['None'] ['Learn the truth about the relationship of "A... ['Learn the truth about the relationship of "A... ['Executives and Business Managers who want to... NaN NaN [{'_class': 'course_has_label', 'id': 22478, '... None [] /course/subscribe/?courseId=352440 False 2020-04-09 2 0 0 19 /course/352440/preview/ /course/learn-the-truth-about-agile-versus-wat... {'category': {'title': 'Business', 'url': '/co... True False False [{'num_votes': 458, 'id': 42, 'applies_to': 'i... NaN [] 0 True True None [] /course/352440/enroll/ /course/learn-the-truth-about-agile-versus-wat... NaN NaN NaN NaN [] NaN [] NaN
3 course 576504 Learn Power BI Basics for Free /course/learn_power_bi_for_free/ False Free None R0N2ANN2Sq-eY9AGldxSTQ [{'_class': 'user', 'title': 'Vishal Pawar', '... https://img-c.udemycdn.com/course/125_H/576504... https://img-c.udemycdn.com/course/240x135/5765... False https://img-c.udemycdn.com/course/480x270/5765... learn_power_bi_for_free SF3YC3o3SvGLXLWJ3x1gkg <p style="">Learn <strong style="">Power BI</s... In this tutorial you will be learning all basi... 93112 [{'_class': 'locale', 'locale': 'en_US', 'titl... None None 4.117857 4.070692 4.070692 4246 141 [{'count': 276, 'rating': 1}, {'count': 306, '... NaN NaN $0.00 0 False False 0 21 20 0 21 20 NaN ... ['You should know terms Query, Excel :)'] ['Learn all basics of Power BI', 'Learn Dashbo... ['Learn all basics of Power BI', 'Learn Dashbo... ['Everyone who play with Data', 'Developer, IT... NaN NaN [{'_class': 'course_has_label', 'id': 108500, ... None [] /course/subscribe/?courseId=576504 False None 0 0 0 6 /course/576504/preview/ /course/learn_power_bi_for_free/?instructorPre... {'category': {'title': 'Business', 'url': '/co... True False False [{'num_votes': 24, 'id': 28, 'applies_to': 'co... NaN [] 0 True True None [] /course/576504/enroll/ /course/learn_power_bi_for_free/learn/ NaN NaN NaN NaN [] NaN [] NaN
4 course 81534 Make Money from Home: How to Build an Online B... /course/how-to-build-an-online-business-a-comp... False Free None aoVyKdX0RIC190Iz_WqJxQ [{'_class': 'user', 'title': 'mike omar', 'nam... https://img-c.udemycdn.com/course/125_H/81534_... https://img-c.udemycdn.com/course/240x135/8153... False https://img-c.udemycdn.com/course/480x270/8153... how-to-build-an-online-business-a-complete-bus... rUWVrrbART6k9w2NZX-qEw <p> \t HOW TO MAKE MONE... Learn all the basics of online entrepreneurshi... 221348 [{'_class': 'locale', 'locale': 'en_US', 'titl... None None 4.490196 4.493055 4.493055 5432 46 [{'count': 93, 'rating': 1}, {'count': 131, 'r... NaN NaN $0.00 0 False False 0 24 24 0 24 24 NaN ... ['None.'] ['To build a $5,000 / month passive income web... ['To build a $5,000 / month passive income web... ['Intended for anyone interested in making mon... NaN NaN [{'_class': 'course_has_label', 'id': 81066, '... None [] /course/subscribe/?courseId=81534 False None 2 0 0 0 /course/81534/preview/ /course/how-to-build-an-online-business-a-comp... {'category': {'title': 'Business', 'url': '/co... True False False [{'num_votes': 92, 'id': 46, 'applies_to': 'in... NaN [] 0 True True None [] /course/81534/enroll/ /course/how-to-build-an-online-business-a-comp... NaN NaN NaN NaN [] NaN [] NaN

5 rows × 150 columns

# List every column name of the raw course table.
list(df_courses.columns)
['_class',
 'id',
 'title',
 'url',
 'is_paid',
 'price',
 'price_detail',
 'price_serve_tracking_id',
 'visible_instructors',
 'image_125_H',
 'image_240x135',
 'is_practice_test_course',
 'image_480x270',
 'published_title',
 'tracking_id',
 'description',
 'headline',
 'num_subscribers',
 'caption_locales',
 'discount',
 'discount_price',
 'avg_rating',
 'avg_rating_recent',
 'rating',
 'num_reviews',
 'num_reviews_recent',
 'rating_distribution',
 'favorite_time',
 'archive_time',
 'earnings',
 'completion_ratio',
 'is_wishlisted',
 'is_saved',
 'num_quizzes',
 'num_lectures',
 'num_published_lectures',
 'num_published_quizzes',
 'num_curriculum_items',
 'num_of_published_curriculum_objects',
 'num_cpe_credits',
 'is_private',
 'num_practice_tests',
 'num_published_practice_tests',
 'original_price_text',
 'quality_status',
 'status_label',
 'can_edit',
 'features',
 'gift_url',
 'num_invitation_requests',
 'notification_settings',
 'is_banned',
 'is_published',
 'intended_category',
 'image_48x27',
 'image_50x50',
 'image_75x75',
 'image_96x54',
 'image_100x100',
 'image_200_H',
 'image_304x171',
 'image_750x422',
 'has_certificate',
 'primary_category',
 'primary_subcategory',
 'is_enrollable_on_mobile',
 'is_in_any_ufb_content_collection',
 'is_in_user_subscription',
 'is_in_subscribed_content_collections',
 'locale',
 'has_closed_caption',
 'caption_languages',
 'created',
 'instructional_level',
 'instructional_level_simple',
 'estimated_content_length',
 'content_info',
 'content_info_short',
 'content_length_practice_test_questions',
 'requirements_data',
 'what_you_will_learn_data',
 'who_should_attend_data',
 'is_available_on_google_app',
 'organization_id',
 'google_in_app_purchase_price_text',
 'promo_asset',
 'is_user_subscribed',
 'apple_in_app_product_id',
 'is_available_on_ios',
 'google_in_app_product_id',
 'faq',
 'apple_in_app_purchase_price_text',
 'type_label',
 'google_in_app_price_detail',
 'apple_in_app_price_detail',
 'client_settings',
 'quality_review_process',
 'is_organization_only',
 'is_cpe_compliant',
 'cpe_field_of_study',
 'cpe_program_level',
 'was_ever_published',
 'buyable_object_type',
 'published_time',
 'is_marketing_boost_agreed',
 'is_owned_by_instructor_team',
 'is_owner_terms_banned',
 'is_taking_disabled',
 'content_length_video',
 'checkout_url',
 'prerequisites',
 'objectives',
 'objectives_summary',
 'target_audiences',
 'last_accessed_time',
 'enrollment_time',
 'course_has_labels',
 'bestseller_badge_content',
 'badges',
 'free_course_subscribe_url',
 'is_recently_published',
 'last_update_date',
 'num_article_assets',
 'num_coding_exercises',
 'num_assignments',
 'num_additional_assets',
 'preview_url',
 'landing_preview_as_guest_url',
 'context_info',
 'has_sufficient_preview_length',
 'has_org_only_setting',
 'is_draft',
 'common_review_attributes',
 'subscription_locale',
 'custom_category_ids',
 'alternate_redirect_course_id',
 'is_approved',
 'is_organization_eligible',
 'instructor_status',
 'available_features',
 'enroll_url',
 'learn_url',
 'predictive_score',
 'relevancy_score',
 'input_features',
 'lecture_search_result',
 'curriculum_lectures',
 'order_in_results',
 'curriculum_items',
 'instructor_name']
# Work on a copy so the raw course table stays untouched.
df = df_courses.copy()

# transform_col, get_float and remove_tags are helpers defined elsewhere in the
# notebook (presumably: pick a field out of a nested record, parse a number out
# of a text value, strip HTML tags -- confirm against their definitions).
df['primary_category'] = transform_col(df['primary_category'], 'title')
df['primary_subcategory'] = transform_col(df['primary_subcategory'], 'title')
df['content_info'] = df['content_info'].apply(get_float)
df['price'] = df['price'].apply(get_float)

# tz_convert(None) turns the timestamps into naive UTC values, so the reference
# "now" has to be naive UTC as well; a local-time now() would shift the age by
# the machine's UTC offset.
df['published_time'] = pd.to_datetime(df['published_time']).dt.tz_convert(None)
now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
# Course age in 30-day "months".
df['published_since_month'] = (now_utc - df['published_time']).apply(lambda x: int(x.days/30))

df['objectives'] = transform_col(df['objectives'])
df['description'] = df['description'].fillna('description not available')
df['description_text'] = df['description'].apply(remove_tags)
RATING_LEVELS = (1, 2, 3, 4, 5)


def rating_shares(raw_distribution):
    """Split one ``rating_distribution`` cell into counts and relative shares.

    Args:
        raw_distribution: either the string form of a list of
            ``{'rating': level, 'count': n}`` records, an already parsed
            sequence of such records, or a missing value (None / NaN / empty).

    Returns:
        A ``(counts, shares)`` pair of dicts keyed by the rating levels 1-5.
        ``shares`` holds ``count / total`` rounded to 3 decimals, or all zeros
        when no rating was given. Levels absent from the input count as 0.
    """
    counts = dict.fromkeys(RATING_LEVELS, 0)

    if isinstance(raw_distribution, str):
        # Stored as text (e.g. after a Parquet/CSV round trip): parse it back.
        raw_distribution = ast.literal_eval(raw_distribution) if raw_distribution.strip() else None

    # None and NaN (a float) both mean "no distribution available".
    if raw_distribution is None or isinstance(raw_distribution, float):
        entries = ()
    else:
        entries = raw_distribution

    for entry in entries:
        level = entry['rating']
        if level in counts:
            counts[level] += entry['count']

    total = sum(counts.values())
    if total > 0:
        shares = {level: round(count * 1.0 / total, 3) for level, count in counts.items()}
    else:
        shares = dict.fromkeys(RATING_LEVELS, 0)
    return counts, shares


rating_orig = []  # absolute number of ratings per level, one dict per course
rating_rel = []   # share of each level among all ratings, one dict per course
for raw_distribution in df['rating_distribution'].values:
    counts, shares = rating_shares(raw_distribution)
    rating_orig.append(counts)
    rating_rel.append(shares)

# Select the columns by rating level instead of relying on the key order of the
# first record, then give them their final names.
df_rating = pd.DataFrame(rating_rel, columns=list(RATING_LEVELS))
df_rating.columns = ['rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5']
# The rows were produced in df's row order; share its index so the concat
# aligns positionally even if df does not carry a default 0..n-1 index.
df_rating.index = df.index
df = pd.concat([df, df_rating], axis=1)
df.shape
(11160, 157)
# Keep only courses that are published and currently live.
published_mask = df['is_published'].eq(True)
live_mask = df['status_label'].eq('Live')
df = df[published_mask & live_mask]

# Remove the columns that were already transformed or are no longer relevant.
obsolete_columns = [
    'published_time',
    'rating_distribution',
    'status_label',
    'is_published',
    'rating',
    'description',
]
df = df.drop(columns=obsolete_columns)
df.shape
(11160, 151)
# Features kept for the analysis, in the order they should appear in the frame.
cols = [
    'avg_rating',
    'avg_rating_recent',
    'description_text',
    'has_certificate',
    'is_paid',
    'id',
    'instructional_level',
    'is_enrollable_on_mobile',
    'is_owned_by_instructor_team',
    'is_practice_test_course',
    'num_article_assets',
    'num_curriculum_items',
    'num_lectures',
    'num_practice_tests',
    'num_quizzes',
    'num_subscribers',
    'num_reviews',
    'objectives',
    'price',
    'published_title',
    'relevancy_score',
    'rating_1',
    'rating_2',
    'rating_3',
    'rating_4',
    'rating_5',
    'published_since_month',
    'primary_category',
    'primary_subcategory',
]
df = df.loc[:, cols]
df.shape
(11160, 29)
# Keep a single row per course id; when an id occurs several times the first row wins.
df=df.drop_duplicates(subset='id', keep='first')
df.shape
(7568, 29)
df.isnull().sum()
avg_rating                        0
avg_rating_recent                 0
description_text                  0
has_certificate                   0
is_paid                           0
id                                0
instructional_level               1
is_enrollable_on_mobile           0
is_owned_by_instructor_team       0
is_practice_test_course           0
num_article_assets                0
num_curriculum_items              0
num_lectures                      0
num_practice_tests                0
num_quizzes                       0
num_subscribers                   0
num_reviews                       0
objectives                        0
price                           647
published_title                   0
relevancy_score                7568
rating_1                          0
rating_2                          0
rating_3                          0
rating_4                          0
rating_5                          0
published_since_month             0
primary_category                  0
primary_subcategory               0
dtype: int64
# Free courses are labelled as free instead of carrying a number -> set their price to 0.
df['price'] = df['price'].fillna(0)

# relevancy_score is not used any further.
df = df.drop(columns='relevancy_score')

# Remove every row that still has a missing value in any column.
df = df.dropna(how='any')

# Some courses have an empty list of objectives; remove those as well.
has_no_objectives = df['objectives'].apply(lambda objectives: objectives == [])
index_to_drop = df.index[has_no_objectives]
df = df.drop(index=index_to_drop)
df.shape
(7550, 28)
# Persist the cleaned courses with a single space as field separator;
# the later sections read the file back with the same sep=' ' and index_col=0.
df.to_csv('df_courses.csv', sep=' ')

Clean the review data

# Load the raw review data from a parquet file hosted on GitHub (needs network access).
df_review_raw = pd.read_parquet("https://github.com/sparsh-ai/reco-data/raw/master/udemy/udemy_reviews.parquet.gzip")
df_review_raw.shape
(146955, 8)
df_review_raw.head()
_class id content rating created modified user_modified user
0 course_review 37926532 None 5.0 2019-09-26T14:14:13Z 2019-09-26T19:07:06Z 2019-09-26T14:14:13Z {'_class': 'user', 'title': 'Ryan McGovern', '...
1 course_review 37914794 None 3.0 2019-09-26T09:04:09Z 2019-09-26T19:07:06Z 2019-09-26T09:04:10Z {'_class': 'user', 'title': 'Mohammad Rashad N...
2 course_review 37851504 None 5.0 2019-09-25T07:33:44Z 2019-09-25T18:04:15Z 2019-09-25T07:33:47Z {'_class': 'user', 'title': 'Arth Patel', 'nam...
3 course_review 37835876 khjhjk 5.0 2019-09-25T02:08:23Z 2019-09-25T11:09:35Z 2019-09-25T02:08:23Z {'_class': 'user', 'title': 'A Kowsar Parveen'...
4 course_review 37834428 NVBNVBNVBN 5.0 2019-09-25T01:30:27Z 2019-09-25T11:09:35Z 2019-09-25T01:30:27Z {'_class': 'user', 'title': 'A Nirmala', 'name...
# Discard any column whose name starts with 'Unnamed'.
named_columns = ~df_review_raw.columns.str.match('Unnamed')
df_review = df_review_raw.loc[:, named_columns]

# Derive two columns from the 'user' records via transform_col.
df_review = df_review.assign(user_name=transform_col(df_review['user'], 'display_name'))
df_review = df_review.assign(user_title=transform_col(df_review['user'], 'title'))

# Placeholder names that do not identify an individual reviewer.
placeholder_names = ['Anonymized User', 'Private Udemy For Business User', 'Udemy User']
df_review = df_review[~df_review['user_name'].isin(placeholder_names)]

cols = ['id', 'created', 'rating', 'user_name']
df_review = df_review[cols]

# The user names in the reviews data are not unique, so it is impossible to
# build a recommender system based on the user ratings.
df_review = df_review.drop_duplicates()
df_review.isnull().sum()
id           0
created      0
rating       0
user_name    0
dtype: int64
# The null counts above show no missing values, so the reviews are saved as they are
# (default comma separator; the index is written as the first column).
df_review.to_csv('df_reviews.csv')

EDA

Most important findings on the course dataset:

  • There are courses with no reviews/ratings, but most of the ratings are between 4 and 4.5
  • The price ranges between 0 and 199 EUR
  • There are some really popular courses with a lot of subscribers. The top 3 are:
      - machinelearning   with more than 300T subscribers
      - python-for-data-science-and-machine-learning-bootcamp with 192T subscribers
      - an-entire-mba-in-1-courseaward-winning-business-school-prof with 187T subscribers
  • Most courses don't have any quizzes or practice tests
  • The number of lectures mostly vary between 13 and 37 (IQR)
  • The average age of a course is 26 months (since it was published). There are more recently published courses than older ones.
  • The majority of the courses are for all levels. Only a few courses require an advanced level.
  • The courses are divided into 16 subcategories, whereas the two most significant are Finance and Entrepreneurship.
      - Two subcategories have an average price higher than 100 dollars : The subcategory Data & analytics with 112, and Project Management with 104
      - The total earning on the courses is the highest in the subcategory for Data & Analytics and the second is in Entrepreneurship.
      - The total number of subscribers are the highest in the category of Entrepreneurship (1.) and in Data & Analytics (2.)
      - There is not much difference between the average ratings of the courses in each subcategory. The highest average ratings are in the subcategories Media and Communications. 
  • I investigated the top words in each subcategory, separately for the attributes objectives and description. E.g. in the subcategory Data & Analytics, the top 5 words are:
    • data, use, model, understand, create

After the univariate analysis I also executed multivariate analysis:

  • There is a positive correlation between the number of reviews/number of subscribers and the average rating - students normally give good ratings for courses they liked
  • As expected, there is a positive correlation between number of subscribers and number of reviews
  • There is also a positive correlation between published since and the average rating -> older courses have better ratings. This seems logical, since I would expect that courses which aren't popular won't stay long in the catalogue
  • The price doesn't have an effect on the average ratings or on the number of subscribers

Most important findings on the reviews dataset:

  • The user names are unfortunately not unique. For this reason, it is not possible to build a recommender system on the user ratings.
    • Most users (more than 600.000) gave only one review, but there are a couple of user names with plenty of reviews: the most common user name is David with more than 400 reviews.
  • Most courses have very few reviews
import pandas as pd
import numpy as np
import ast
import scipy.stats as st

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

#for the text attributes
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from operator import itemgetter
from nltk.stem import SnowballStemmer

EDA of the course dataset

# Reload the cleaned courses; the 'objectives' column is parsed back from its
# text form into a Python object with ast.literal_eval.
df_courses = pd.read_csv('df_courses.csv', index_col=0, sep=' ', converters={"objectives": ast.literal_eval})
df_courses.head()
avg_rating avg_rating_recent description_text has_certificate is_paid id instructional_level is_enrollable_on_mobile is_owned_by_instructor_team is_practice_test_course num_article_assets num_curriculum_items num_lectures num_practice_tests num_quizzes num_subscribers num_reviews objectives price published_title rating_1 rating_2 rating_3 rating_4 rating_5 published_since_month primary_category primary_subcategory
0 4.483728 4.485416 SECRET&nbsp;SAUCE&nbsp;OF&nbsp;GREAT&nbsp;WRIT... False False 171838 Beginner Level True False False 0 17 17 0 0 187239 13806 [In this course, Shani Raja, a former Wall Str... 0.0 secret-sauce-of-great-writing 0.009 0.020 0.124 0.359 0.489 89 Business Communication
1 4.452830 4.460957 The goals of this course are simple: To learn... False False 780078 All Levels True False False 0 23 23 0 0 129594 8198 [Speak fearlessly to a crowd (or face to face)... 0.0 fearless-public-speaker-bootcamp-by-ricky-mendoza 0.016 0.030 0.178 0.389 0.385 65 Business Communication
2 4.509386 4.539074 Important:This course is part of an integrated... False False 352440 All Levels True False False 2 20 13 0 7 107450 14781 [Learn the truth about the relationship of "Ag... 0.0 learn-the-truth-about-agile-versus-waterfall 0.003 0.013 0.123 0.440 0.421 80 Business Project Management
3 4.117857 4.070692 Learn Power BI Basics for Free !Power BI trans... False False 576504 Beginner Level True False False 0 21 21 0 0 93112 4246 [Learn all basics of Power BI, Learn Dashboard... 0.0 learn_power_bi_for_free 0.066 0.073 0.245 0.334 0.282 71 Business Business Analytics & Intelligence
4 4.490196 4.493055 HOW TO MAKE MONEY ON... False False 81534 All Levels True False False 2 24 24 0 0 221348 5432 [To build a $5,000 / month passive income webs... 0.0 how-to-build-an-online-business-a-complete-bus... 0.017 0.024 0.120 0.306 0.532 96 Business E-Commerce

Numerical columns

df_courses.describe()
avg_rating avg_rating_recent id num_article_assets num_curriculum_items num_lectures num_practice_tests num_quizzes num_subscribers num_reviews price rating_1 rating_2 rating_3 rating_4 rating_5 published_since_month
count 7550.000000 7550.000000 7.550000e+03 7550.000000 7550.000000 7550.000000 7550.000000 7550.000000 7550.000000 7550.000000 7550.000000 7550.000000 7550.000000 7550.000000 7550.000000 7550.000000 7550.000000
mean 3.722400 3.708659 2.194797e+06 1.838411 31.658940 29.803841 0.166358 1.565298 3635.219603 282.517881 54.756413 0.028571 0.036734 0.133282 0.265107 0.414574 31.973907
std 1.458542 1.459689 1.232601e+06 5.997117 41.514367 40.194758 0.797819 4.218708 11915.657079 1809.965944 40.271189 0.071305 0.083417 0.148180 0.199354 0.262966 26.615780
min 0.000000 0.000000 8.075000e+03 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 3.750000 3.746468 1.167334e+06 0.000000 12.000000 11.000000 0.000000 0.000000 27.000000 4.000000 19.990000 0.000000 0.000000 0.000000 0.111000 0.267000 9.000000
50% 4.250000 4.226052 2.209831e+06 0.000000 21.000000 20.000000 0.000000 0.000000 354.000000 19.000000 44.990000 0.000000 0.008000 0.115000 0.286000 0.429000 25.000000
75% 4.500000 4.507347 3.341070e+06 1.000000 37.000000 35.000000 0.000000 1.000000 2779.500000 82.000000 94.990000 0.028000 0.044000 0.194000 0.376000 0.570000 49.000000
max 5.000000 5.000000 4.177134e+06 163.000000 638.000000 637.000000 6.000000 127.000000 407741.000000 69434.000000 199.990000 1.000000 1.000000 1.000000 1.000000 1.000000 121.000000

There are around 900 courses with no reviews/ratings, but most of the ratings are between rating 4 and 4.5.

# Histogram of the average course rating: the rating is on the x-axis and the
# number of courses per bin on the y-axis (the two labels were swapped before).
plt.hist(df_courses['avg_rating'], bins=50)
plt.xlabel('Average rating')
plt.ylabel('Number of courses')
plt.title('Distribution of average rating')
plt.savefig('avg_rating.png')

The price ranges between 0 and 199 EUR. Most courses cost either 19.99 or 199.99 $.

# Histogram of the course price: the price is on the x-axis and the number of
# courses per bin on the y-axis (the two labels were swapped before).
plt.hist(df_courses['price'], bins=50)
plt.xlabel('Price')
plt.ylabel('Number of courses')
plt.title('Distribution of price')
plt.savefig('price.png')

I checked which courses are most visited. Courses with the top 10 most subscribers can be seen below:

# The ten courses with the largest number of subscribers, largest first.
top10_courses = (
    df_courses
    .sort_values('num_subscribers', ascending=False)
    [['published_title', 'num_subscribers']]
    .head(10)
)
for course_title, subscriber_count in zip(top10_courses['published_title'],
                                          top10_courses['num_subscribers']):
    print('The course {} has {} subscribers.'.format(course_title, subscriber_count))
The course an-entire-mba-in-1-courseaward-winning-business-school-prof has 407741 subscribers.
The course the-complete-sql-masterclass-for-data-analytics has 325874 subscribers.
The course how-to-build-an-online-business-a-complete-business-plan has 221348 subscribers.
The course pmp-pmbok6-35-pdus has 206294 subscribers.
The course secret-sauce-of-great-writing has 187239 subscribers.
The course powerbi-complete-introduction has 171089 subscribers.
The course the-business-intelligence-analyst-course-2018 has 153522 subscribers.
The course microsoft-power-bi-up-running-with-power-bi-desktop has 140450 subscribers.
The course agile-crash-course has 139350 subscribers.
The course online-business-selling-simple-products-amazon has 136379 subscribers.
# Horizontal bar chart of the ten most subscribed courses, one bar per course title.
fig, ax = plt.subplots(figsize=(8, 5))
bar_positions = np.arange(len(top10_courses))
ax.barh(bar_positions, top10_courses['num_subscribers'], alpha=0.6)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top10_courses['published_title'])
ax.set_title('Top 10 courses with most subscribers')
ax.set_xlabel('Number of subscribers')
fig.savefig('top10courses.png')

I plotted a histogram and a boxplot for each numerical attribute. Some of the features have outliers, and their distributions are skewed.

# Numerical attributes to inspect.
var_num = [
    'avg_rating', 'avg_rating_recent', 'num_article_assets', 'num_curriculum_items',
    'num_lectures', 'num_practice_tests', 'num_quizzes', 'num_subscribers',
    'num_reviews', 'price', 'published_since_month',
    'rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5',
]

# One figure per attribute: histogram on the left, boxplot on the right,
# plus a printed count of how many courses have the value 0.
for col in var_num:
    values = df_courses[col]
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(16, 4))
    hist_ax.hist(values, bins=20)
    hist_ax.set_title('Distribution of ' + str(col))
    box_ax.boxplot(values)
    box_ax.set_title('Boxplot of ' + str(col))
    zero_count = int((values == 0).sum())
    print('Number of 0 values of attribute {} is {}.'.format(col, zero_count))
Number of 0 values of attribute avg_rating is 917.
Number of 0 values of attribute avg_rating_recent is 917.
Number of 0 values of attribute num_article_assets is 4866.
Number of 0 values of attribute num_curriculum_items is 0.
Number of 0 values of attribute num_lectures is 268.
Number of 0 values of attribute num_practice_tests is 7118.
Number of 0 values of attribute num_quizzes is 5098.
Number of 0 values of attribute num_subscribers is 356.
Number of 0 values of attribute num_reviews is 917.
Number of 0 values of attribute price is 642.
Number of 0 values of attribute published_since_month is 150.
Number of 0 values of attribute rating_1 is 4091.
Number of 0 values of attribute rating_2 is 3680.
Number of 0 values of attribute rating_3 is 2260.
Number of 0 values of attribute rating_4 is 1706.
Number of 0 values of attribute rating_5 is 1312.

I defined all data points whose distance from the mean is more than 3 standard deviations as outliers. I checked the distribution without these outliers. I didn't remove these outliers from the data; I only left them out of the plots to get a better understanding of the distribution of the features.

# Attributes for which the 3-sigma outliers are left out of the plots.
var_num = [
    'avg_rating', 'avg_rating_recent', 'num_article_assets', 'num_curriculum_items',
    'num_lectures', 'num_practice_tests', 'num_quizzes', 'num_subscribers',
    'num_reviews', 'price', 'published_since_month',
]
excluded_all = []

for col in var_num:
    values = df_courses[col]
    center = values.mean()
    spread = values.std()
    # A value is kept when it lies strictly within mean +/- 3 standard deviations.
    within_band = (values > center - 3 * spread) & (values < center + 3 * spread)
    kept = df_courses[within_band]
    # Remember the index labels of the rows that fell outside the band.
    excluded_all.extend(df_courses.index[~within_band])

    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(16, 4))
    hist_ax.hist(kept[col], bins=20)
    hist_ax.set_title('Distribution of ' + str(col))
    box_ax.boxplot(kept[col])
    box_ax.set_title('Boxplot of ' + str(col))
    print('Number of dropped values of attribute {} is {}.'.format(col, len(df_courses) - len(kept)))

# Rows that are an outlier in at least one of the attributes.
excluded = set(excluded_all)
Number of dropped values of attribute avg_rating is 0.
Number of dropped values of attribute avg_rating_recent is 0.
Number of dropped values of attribute num_article_assets is 111.
Number of dropped values of attribute num_curriculum_items is 113.
Number of dropped values of attribute num_lectures is 115.
Number of dropped values of attribute num_practice_tests is 185.
Number of dropped values of attribute num_quizzes is 102.
Number of dropped values of attribute num_subscribers is 90.
Number of dropped values of attribute num_reviews is 59.
Number of dropped values of attribute price is 90.
Number of dropped values of attribute published_since_month is 11.
# Pairwise correlations between the numerical attributes, drawn as a heatmap
# with the attribute names on both axes.
corr = df_courses[var_num].corr()
fig, ax = plt.subplots(figsize=(10, 7))
axis_labels = corr.columns.values
sns.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels)
<matplotlib.axes._subplots.AxesSubplot at 0x7f184355d290>
# Pairwise scatter plots of a few key attributes, leaving out every row that
# was flagged as an outlier above.
cols = ['avg_rating', 'num_subscribers', 'published_since_month', 'num_reviews', 'price']
flagged_as_outlier = df_courses.index.isin(excluded)
temp = df_courses[~flagged_as_outlier]
sns.pairplot(temp.loc[:, cols], plot_kws=dict(alpha=0.2))
<seaborn.axisgrid.PairGrid at 0x7f185ab72a50>

The followings can be seen from the pairplot above:

  • There is a positive correlation between the number of reviews/number of subscribers and the average rating - students normally give better ratings
  • As expected, there is a positive correlation between number of subscribers and number of reviews
  • There is also a positive correlation between published since and the average rating -> older courses have better ratings. This seems logical, since I would expect that courses which aren't popular won't stay long in the catalogue
  • The price doesn't have an effect on the average ratings
# Pairwise scatter plots of the five rating-share columns (all courses, '+' markers).
sns.pairplot(df_courses[['rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5']], markers="+",  plot_kws= {'alpha': 0.2})
<seaborn.axisgrid.PairGrid at 0x7f1853009f50>

Discrete variables

# Discrete attributes to inspect.
var_char = ['has_certificate', 'is_paid', 'instructional_level', 'is_enrollable_on_mobile',
            'is_owned_by_instructor_team', 'is_practice_test_course',
            'primary_category', 'primary_subcategory']

# One bar chart per attribute showing how many courses fall into each value.
for col in var_char:
    temp = df_courses[col].value_counts()
    plt.figure(figsize=(8, 4))
    ax = temp.plot(kind='bar', alpha=0.4)
    ax.set_title(col)
    ax.set_ylabel('Number of courses')
    ax.set_xticklabels(temp.index)
# Render all figures created in the loop.
plt.show()
# Earnings per course: list price multiplied by the number of subscribers.
df_courses['earnings'] = df_courses['price'].mul(df_courses['num_subscribers'])

# Per-subcategory summary: totals for subscribers and earnings, means for rating and price.
subcategory_aggregations = {
    'num_subscribers': 'sum',
    'avg_rating': 'mean',
    'price': 'mean',
    'earnings': 'sum',
}
df_subcategories = df_courses.groupby('primary_subcategory').agg(subcategory_aggregations)
df_subcategories
num_subscribers avg_rating price earnings
primary_subcategory
Business Analytics & Intelligence 4214709 3.999165 62.937681 3.075999e+08
Business Law 169513 3.362634 49.868537 5.868878e+06
Business Strategy 1088060 3.583135 53.535036 6.824195e+07
Communication 2950201 3.862910 60.089638 1.975740e+08
E-Commerce 4324642 3.834319 52.495375 2.031618e+08
Entrepreneurship 5385044 3.637652 52.207031 3.343825e+08
Human Resources 666135 3.506018 48.806028 3.935981e+07
Industry 228936 3.679406 60.106977 1.182660e+07
Management 2144868 3.646178 52.211575 1.392118e+08
Media 665708 3.828681 56.335990 3.642243e+07
Operations 688061 3.781765 51.565097 3.909176e+07
Other Business 875806 3.677553 49.139443 4.671434e+07
Project Management 2324836 3.815041 57.820537 1.626259e+08
Real Estate 532547 3.890769 60.011066 4.694027e+07
Sales 1186842 3.592184 57.008824 8.376575e+07
# Caption for every aggregated column. Keying the captions by column name keeps
# each plot paired with the right title regardless of the column order
# (previously the price plot was titled as earnings and vice versa).
titles = {
    'num_subscribers': 'Total number of subscribers',
    'avg_rating': 'Average rating of courses',
    'price': 'Average price of courses',
    'earnings': 'Total earning on courses',
}

# 2x2 grid with one bar chart per aggregated column.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
for position, (col, ax) in enumerate(zip(df_subcategories.columns, axes.flat)):
    df_subcategories[col].plot(kind='bar', ax=ax, alpha=0.5)
    ax.set_title(titles.get(col, col))
    if position < 2:
        # Top row: hide the category tick labels, the bottom row shows them.
        # A real boolean is required here; the string 'off' is truthy.
        ax.tick_params(labelbottom=False)
plt.show()
  • Two subcategories have an average price higher than 100 dollars : The subcategory Data & analytics with 112, and Project Management with 104
  • The total earning on the courses is the highest in the subcategory for Data & Analytics and the second is in Entrepreneurship.
  • The total number of subscribers are the highest in the category of Entrepreneurship and in Data & Analytics

Attribute Objectives

I will analyse the attribute objectives of the courses to get a better understanding of the courses. At first I needed to transform the list of objectives into one string, and then investigate the frequencies of each word. I also implemented stemming: for that, I created a dataframe where the indexes are the stemmed words and the values are the original words that were stemmed. I needed it to transform the stemmed words back. By means of stemming, similar words are counted as the same word (e.g. the words learn and learning are treated as one word).

import nltk
nltk.download('punkt')
nltk.download('stopwords')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
True
# Turn each course's list of objectives into one text via the notebook helper combine_list,
# then build the stem -> word lookup with vocab_stem (both helpers are defined elsewhere).
objectives_text=df_courses['objectives'].apply(combine_list)
vocab_frame_orig=vocab_stem(objectives_text)
vocab_frame_orig.head()
words
in in
this this
cours course
, ,
shani shani
# drop_words is a notebook helper; per the surrounding text it removes duplicate stems.
vocab_frame=drop_words(vocab_frame_orig)
# Tokens to ignore: English stop words, punctuation characters and a few extra fragments.
StopWords=set(stopwords.words('english')+list(punctuation)+["’", "n't", "'s", "--", "-", "...", "``", "''", "“", "039"])
# Top words of the objectives, once as bar charts and once as word clouds
# (the meaning of the boolean third argument is defined by top_words_graph — TODO confirm).
top_words_graph(df_courses, 'objectives', True, 'bar', StopWords, vocab_frame)
top_words_graph(df_courses, 'objectives', True, 'wordcloud', StopWords, vocab_frame)

Attribute Description

# Build the stem -> word lookup for the description text with the notebook helper vocab_stem.
vocab_frame_descr=vocab_stem(df_courses['description_text'])
vocab_frame_descr.head()
words
secret secret
& &
nbsp nbsp
; ;
sauc sauce
# Same steps as for the objectives, applied to the description text.
vocab_frame_descr=drop_words(vocab_frame_descr)
top_words_graph(df_courses, 'description_text', False, 'bar', StopWords, vocab_frame_descr)
top_words_graph(df_courses, 'description_text', False, 'wordcloud', StopWords, vocab_frame_descr)

EDA of the reviews dataset

# Reload the cleaned reviews; the first CSV column holds the saved index.
df_reviews=pd.read_csv('df_reviews.csv', index_col=0)
df_reviews.head()
id created rating user_name
0 37926532 2019-09-26T14:14:13Z 5.0 Ryan McGovern
1 37914794 2019-09-26T09:04:09Z 3.0 Mohammad Rashad Nadeer Kutty
2 37851504 2019-09-25T07:33:44Z 5.0 Arth Patel
3 37835876 2019-09-25T02:08:23Z 5.0 A Kowsar Parveen
4 37834428 2019-09-25T01:30:27Z 5.0 A Nirmala
# Number of reviews per user name, most frequent names first.
nr_user = df_reviews['user_name'].value_counts()
# unique: the distinct review counts in ascending order;
# counts: how many user names have exactly that many reviews.
unique, counts = np.unique(nr_user, return_counts=True)

# plt.subplots returns (figure, axes) in that order.
fig, ax = plt.subplots(figsize=(10, 4))

# Only the 20 smallest review counts are shown.
positions = np.arange(len(counts[:20]))
plt.bar(positions, counts[:20], align='center')
plt.xticks(positions, unique[:20])
plt.xlabel('number of reviews per user')
plt.ylabel('number of users')
plt.title('Number of reviews per user')
plt.show()

# The 20 user names with the most reviews.
fig, ax = plt.subplots(figsize=(10, 4))
nr_user[:20].plot(kind='bar', alpha=0.4)
<matplotlib.axes._subplots.AxesSubplot at 0x7f18326f8850>

Clustering and Recommender system

In this section, I cluster the courses and based on the new clusters and other course features, I build a recommender system.

For the clustering I investigated the attributes OBJECTIVES and DESCRIPTION. After the preparation of these two attributes, the first part of the notebook tries to cluster the courses based on the attribute OBJECTIVES, while in the second part I build the clusters by means of the course DESCRIPTIONs. After comparing the results, I used the clustering algorithm based on the description field. The last part of the notebook shows the recommender system, which helps the user to find courses similar to the previously taken ones.

import pandas as pd
import numpy as np
import ast
import scipy.stats as st
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import pickle

Prepare the data

# Reload the cleaned courses for the clustering part; 'objectives' is parsed
# back from its text form with ast.literal_eval.
df_courses=pd.read_csv('df_courses.csv', index_col=0, sep=' ', converters={"objectives": ast.literal_eval})
df_courses.head()
avg_rating avg_rating_recent description_text has_certificate is_paid id instructional_level is_enrollable_on_mobile is_owned_by_instructor_team is_practice_test_course num_article_assets num_curriculum_items num_lectures num_practice_tests num_quizzes num_subscribers num_reviews objectives price published_title rating_1 rating_2 rating_3 rating_4 rating_5 published_since_month primary_category primary_subcategory
0 4.483728 4.485416 SECRET&nbsp;SAUCE&nbsp;OF&nbsp;GREAT&nbsp;WRIT... False False 171838 Beginner Level True False False 0 17 17 0 0 187239 13806 [In this course, Shani Raja, a former Wall Str... 0.0 secret-sauce-of-great-writing 0.009 0.020 0.124 0.359 0.489 89 Business Communication
1 4.452830 4.460957 The goals of this course are simple: To learn... False False 780078 All Levels True False False 0 23 23 0 0 129594 8198 [Speak fearlessly to a crowd (or face to face)... 0.0 fearless-public-speaker-bootcamp-by-ricky-mendoza 0.016 0.030 0.178 0.389 0.385 65 Business Communication
2 4.509386 4.539074 Important:This course is part of an integrated... False False 352440 All Levels True False False 2 20 13 0 7 107450 14781 [Learn the truth about the relationship of "Ag... 0.0 learn-the-truth-about-agile-versus-waterfall 0.003 0.013 0.123 0.440 0.421 80 Business Project Management
3 4.117857 4.070692 Learn Power BI Basics for Free !Power BI trans... False False 576504 Beginner Level True False False 0 21 21 0 0 93112 4246 [Learn all basics of Power BI, Learn Dashboard... 0.0 learn_power_bi_for_free 0.066 0.073 0.245 0.334 0.282 71 Business Business Analytics & Intelligence
4 4.490196 4.493055 HOW TO MAKE MONEY ON... False False 81534 All Levels True False False 2 24 24 0 0 221348 5432 [To build a $5,000 / month passive income webs... 0.0 how-to-build-an-online-business-a-complete-bus... 0.017 0.024 0.120 0.306 0.532 96 Business E-Commerce
# Reload the cleaned reviews; the first CSV column holds the saved index.
df_reviews=pd.read_csv('df_reviews.csv', index_col=0)
df_reviews.head()
id created rating user_name
0 37926532 2019-09-26T14:14:13Z 5.0 Ryan McGovern
1 37914794 2019-09-26T09:04:09Z 3.0 Mohammad Rashad Nadeer Kutty
2 37851504 2019-09-25T07:33:44Z 5.0 Arth Patel
3 37835876 2019-09-25T02:08:23Z 5.0 A Kowsar Parveen
4 37834428 2019-09-25T01:30:27Z 5.0 A Nirmala

Prepare the attribute OBJECTIVES

The feature Objectives is a list of course objectives. At first I make a string from the list items by means of the function combine_list.

For the stemming I saved all words with their stemmed correspondence in the dataframe vocab_frame. Since I am interested only in the stemmed words, I dropped all the duplicates from this dataframe (e.g. I treat learn and learning as the same words). This dataframe will be used to transform back the stemmed words.

I defined the StopWords which contains all the expression that shouldn't be considered from the texts.

Finally I applied the TfidfVectorizer on the objectives attribute: this transformer builds feature vectors from text documents in a way that helps to identify words which are frequent in a document but rare in the corpus.

# One text per course, produced from the list of objectives by the notebook helper combine_list.
objectives_text=df_courses['objectives'].apply(combine_list)
vocab_frame_orig=vocab_stem(objectives_text)
#drop duplicates from the dataframe with stemmed words
vocab_frame=drop_words(vocab_frame_orig)
# Tokens to ignore: English stop words, punctuation characters and a few extra fragments.
StopWords=set(stopwords.words('english')+list(punctuation)+["’", "n't", "'s", "--", "-", "...", "``", "''", "“", "039"])
# TF-IDF features: at most 1000 terms, ignoring terms that occur in more than 80% of the texts.
# tokenize is a notebook helper defined elsewhere.
# NOTE(review): recent scikit-learn releases may reject a set for stop_words (a list is expected) — confirm before upgrading.
vectorizer= TfidfVectorizer(stop_words=StopWords, tokenizer=tokenize, max_features=1000, max_df=0.8)
X=vectorizer.fit_transform(objectives_text)
X.shape
(7550, 1000)
# Terms behind the TF-IDF columns. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() when it exists and fall
# back to the old method on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    word_features = list(vectorizer.get_feature_names_out())
else:
    word_features = vectorizer.get_feature_names()
word_features[50:55]
['analysi', 'analyst', 'analyt', 'analyz', 'ani']

Prepare the attribute DESCRIPTION

I executed the same steps as for the attribute Objectives, except for the combine_list function: the attribute Description is already a string and not a list.

# Stem -> word lookup for the description text (vocab_stem / drop_words are notebook helpers).
vocab_frame_descr=vocab_stem(df_courses['description_text'])
vocab_frame_descr=drop_words(vocab_frame_descr)
# Tokens to ignore: English stop words, punctuation characters and a few extra fragments.
StopWords=set(stopwords.words('english')+list(punctuation)+["’", "n't", "'s", "--", "-", "...", "``", "''", "“", "039"])
# TF-IDF features: at most 1000 terms, ignoring terms that occur in more than 80% of the texts.
# NOTE(review): recent scikit-learn releases may reject a set for stop_words (a list is expected) — confirm before upgrading.
vectorizer_descr= TfidfVectorizer(stop_words=StopWords, tokenizer=tokenize, max_features=1000, max_df=0.8)
X_descr=vectorizer_descr.fit_transform(df_courses['description_text'])
X_descr.shape
(7550, 1000)
# Terms behind the TF-IDF columns of the descriptions. get_feature_names() was
# removed in scikit-learn 1.2, so prefer get_feature_names_out() when it exists
# and fall back to the old method on older installations.
if hasattr(vectorizer_descr, 'get_feature_names_out'):
    word_features_descr = list(vectorizer_descr.get_feature_names_out())
else:
    word_features_descr = vectorizer_descr.get_feature_names()
word_features_descr[50:55]
['almost', 'along', 'alreadi', 'also', 'alway']

Clustering with the OBJECTIVES

K-Means Clustering - with k=15 clusters

At first I tried to create 15 clusters - there are 16 subcategories, but no need for category 'others'.

# k-means with 15 clusters on the TF-IDF matrix of the objectives (10 initialisations, fixed seed).
# NOTE(review): the n_jobs argument of KMeans was deprecated in scikit-learn 0.23 and later removed — confirm the pinned version before upgrading.
kmeans = KMeans(n_clusters = 15, n_init = 10, n_jobs = -1, random_state=1234)
kmeans.fit(X)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=15, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=1234, tol=0.0001, verbose=0)
# Notebook helpers defined elsewhere: request 10 words per cluster from the fitted model
# and print them, one line per cluster (presumably the top-weighted centroid terms — TODO confirm).
common_words=get_common_words(kmeans, 10)
print_common_words(common_words, word_features, vocab_frame)
0 : data, visualization, using, analysis, learn, create, tableau, charts, dashboard, analytics
1 : learn, using, business, get, understand, create, course, works, markets, effectively
2 : project, management, risk, plan, using, understand, learn, schedule, scope, controls
3 : exam, pmp, questions, passing, practical, prepared, certification, tested, management, project
4 : products, amazon, sell, store, shopify, dropship, find, fba, create, custom
5 : business, markets, start, plan, understand, successful, ideas, strategies, create, grow
6 : presentations, communicate, audiences, speak, interview, confidence, skills, effectively, using, meeting
7 : writing, books, emails, publish, amazon, kindle, create, ebook, learn, using
8 : online, website, money, make, create, wordpress, business, income, start, build
9 : sales, custom, closing, sell, prospecting, learn, lead, techniques, markets, objectives
10 : team, coaching, management, leadership, develop, employees, leaders, works, motivate, effectively
11 : estate, real, property, investment, deal, investors, learn, build, understand, commercial
12 : agile, scrum, project, management, team, learn, products, understand, methodologies, principle
13 : students, able, course, learn, end, business, understand, management, basics, knowledge
14 : understand, management, using, process, works, custom, identify, create, develop, systems
# Plot the common words via the notebook helper squarify_words.
squarify_words(common_words, word_features, vocab_frame)
# Store the cluster label of every course, then plot clusters against categories
# with the notebook helper heatmap_categories_cluster.
df_courses['cluster']=kmeans.labels_
heatmap_categories_cluster('cluster', df_courses, 'Reds' )

Relationship between number of clusters (k) and inertia

I investigated the relationship between the number of clusters and the inertia (within-cluster sum of squares) to find the optimal number of clusters. According to the elbow method, the curve looks like an arm, and the "elbow" of the arm marks the best value of k.

# Inertia for k = 1..29 (get_inertia and plot_inertia are notebook helpers defined elsewhere).
kRange = range(1, 30)
inertia_Kmean = get_inertia(X, kRange)
plot_inertia(kRange, inertia_Kmean)

# Highlight k=6. kRange starts at 1, so position k - 1 holds the value for k
# (assuming get_inertia returns one value per k, in kRange order).
k_candidate = 6
candidate_inertia = inertia_Kmean[k_candidate - 1]
plt.plot([k_candidate], [candidate_inertia], 'o--', color='dimgray', linewidth=3)

# Hand-placed dashed guide line.
plt.plot([1, 6, 11], [8520, 8170, 7820], '--', color='k', linewidth=1)

arrow_style = dict(facecolor='dimgray', shrink=0.05)
plt.annotate("Let's try k=6", xy=(k_candidate, candidate_inertia), xytext=(6, 7700),
             size=14, weight='bold', color='dimgray',
             arrowprops=arrow_style)
Text(6, 7700, "Let's try k=6")

K-Means with k=6 clusters

It is hard to tell the optimal number of clusters from the graph. I tried several numbers of clusters and finally created 6 clusters with the k-Means algorithm.

# k-means with 6 clusters on the TF-IDF matrix of the objectives (10 initialisations, fixed seed).
# NOTE(review): the n_jobs argument of KMeans was deprecated in scikit-learn 0.23 and later removed — confirm the pinned version before upgrading.
kmeans = KMeans(n_clusters = 6, n_init = 10, n_jobs = -1, random_state=1234)
kmeans.fit(X)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=6, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=1234, tol=0.0001, verbose=0)
# Notebook helpers defined elsewhere: request 10 words per cluster from the fitted model
# and print them, one line per cluster.
common_words=get_common_words(kmeans, 10)
print_common_words(common_words, word_features, vocab_frame)
0 : learn, understand, management, using, works, exam, course, create, team, effectively
1 : data, visualization, using, learn, analysis, create, tableau, charts, analytics, excellent
2 : products, online, amazon, sell, website, create, money, store, make, learn
3 : project, management, agile, risk, learn, using, understand, plan, pmp, exam
4 : business, start, learn, plan, markets, understand, successful, ideas, strategies, create
5 : sales, custom, sell, learn, closing, services, prospecting, markets, lead, understand
# Plot the common words via the notebook helper squarify_words.
squarify_words(common_words, word_features, vocab_frame)
# Overwrite the cluster label of every course with the 6-cluster result.
df_courses['cluster']=kmeans.labels_
heatmap_categories_cluster('cluster', df_courses, 'Reds')
# Notebook helpers; the argument 5 is presumably the number of words / titles shown per cluster — TODO confirm.
plot_common_words(kmeans, 5, word_features, vocab_frame, df_courses, 'cluster')
print_titles_cluster(5, df_courses, 'cluster')
['secret-sauce-of-great-writing'
 'fearless-public-speaker-bootcamp-by-ricky-mendoza'
 'learn-the-truth-about-agile-versus-waterfall' 'learn_power_bi_for_free'
 'how-to-start-a-startup-business']
['how-to-build-an-online-business-a-complete-business-plan'
 'free-amazon-fba-course-for-beginners-2019-private-label'
 'start-a-business-in-5-days-with-shopify-and-productlistgenie'
 'amazon-to-ebay-dropshipping-home-business-starter-course'
 'web-developer-course-on-creating-a-business-website']
['sql-for-real-world-data-analysis' 'tableau-for-beginners-free'
 'power-bi-the-ultimate-orientation' 'unlock-the-story-from-data'
 'machinelearning-analytics']
['agile-project-management-utilizing-microsoft-project-planning-tools'
 'project-management-professional-pmp-intro'
 'enterprise-level-agile-project-management-course-7-of-7'
 'microsoft-project-project-management-scheduling-planning'
 'agile-methodologies-overview']
['unexpected-amazon' 'how-to-start-a-business-with-no-money-y'
 'plan-your-business' 'business-analysis-defined'
 'entrepreneurship-the-making-of-a-great-entrepreneur']
['become-a-magento-developer-front-end-cert-subtitled'
 'sales-rockstar-plan' 'entrepreneurship-natural-products'
 'sales-scripts-for-appointment-setting-b2b-quick-start'
 'mastery-course-training-automation-secrets']

Hierarchical Clustering

In this section I used hierarchical clustering. This method supposes that at the beginning each item forms its own cluster. The algorithm then merges the individual clusters one by one. I created a dendrogram, which shows the distances between the clusters. I plotted the last 16 merges of the hierarchical clustering algorithm.

# Linkage of the objectives TF-IDF matrix and its dendrogram (get_linkage / plot_dendrogram
# are notebook helpers defined elsewhere).
z=get_linkage(X )
plot_dendrogram(z, 16, line_dist=7.8)
# Cut the dendrogram at distance 7.8: each course gets a flat cluster label.
df_courses['cluster_hier']=fcluster(Z=z, t=7.8, criterion='distance')
# Number of courses per hierarchical cluster.
df_courses['cluster_hier'].value_counts()
3    4478
1    2135
2     496
5     230
4     211
Name: cluster_hier, dtype: int64
# Heatmap for the hierarchical-clustering labels of the objectives attribute
# (helper defined elsewhere in the notebook).
heatmap_categories_cluster('cluster_hier', df_courses, 'Reds' )

The distribution of the clusters produced by hierarchical clustering is very unbalanced.

PCA for plotting the courses

I will do a simple PCA analysis and keep the first 2 principal components in order to plot the courses in 2D. I will use the results of the k-means clustering (with 6 groups), since the hierarchical clustering resulted in one disproportionately large group.

# 2-D PCA plot of the objectives features with the k-means labels.
# NOTE(review): the meaning of the third argument (500) is defined by the helper -- confirm.
plot_with_pca (X, df_courses['cluster'], 500)
The explained variance through the first 2 principal comonent is 0.0271.

In the 2-D plot, almost all the clusters are well separated from each other. Cluster 0 and cluster 5 overlap each other; cluster 5 contains the majority of the courses.

Clustering with the description

After building clusters with the objectives attribute, I investigated the course descriptions as the basis of the clustering algorithms. I ran the same analyses and got better-distributed clusters using the description feature.

K-Means clustering - k=15 clusters

At first I tried to create 15 clusters, similar to the previous clusterings with the objectives attribute. There are clusters with only a few courses, so I tried to optimize the number of clusters to build (k).

# K-means on the description features: 15 clusters, 10 initialisations, fixed seed.
# NOTE(review): `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0;
# drop the argument when running on a newer release.
kmeans_descr = KMeans(n_clusters = 15, n_init = 10, n_jobs = -1, random_state=1234)
kmeans_descr.fit(X_descr)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=15, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=1234, tol=0.0001, verbose=0)
# Ten most common words per description cluster (helpers defined elsewhere in
# the notebook); printed as one line per cluster.
common_words=get_common_words(kmeans_descr, 10)
print_common_words(common_words, word_features_descr, vocab_frame_descr)
0 : writing, money, makes, get, online, freelancer, book, start, business, sell
1 : project, managed, pmp, risk, schedule, certification, plan, used, nbsp, learning
2 : data, analytics, visuals, tableau, used, analysis, bi, learning, power, statistics
3 : sales, sell, customers, prospecting, closing, learning, business, skills, get, makes
4 : business, start, marketing, plan, success, entrepreneur, learning, get, ideas, helped
5 : learning, used, business, customers, get, understanding, works, needed, helped, nbsp
6 : website, wordpress, creating, site, ecommerce, online, web, used, blog, build
7 : shopify, store, dropshipping, ecommerce, product, online, e-commerce, business, customers, start
8 : nbsp, business, learning, used, get, makes, managed, works, helped, start
9 : agile, scrum, project, team, managed, nbsp, product, certification, developed, mastering
10 : managed, team, organize, leadership, employee, perform, leaders, nbsp, change, learning
11 : estate, real, properties, investing, deals, investors, nbsp, agent, home, marketing
12 : amazon, product, fba, sell, seller, business, start, find, get, book
13 : exam, questions, pmp, testing, certification, pass, practicing, project, managed, prepare
14 : presentations, speaking, communicate, public, audience, skills, speaker, speech, people, learning
# Visualise the common words of the 15 description clusters (helper defined elsewhere).
squarify_words(common_words, word_features_descr, vocab_frame_descr)
# Store the k=15 labels on the course table.
df_courses['cluster_descr']=kmeans_descr.labels_
heatmap_categories_cluster('cluster_descr', df_courses, 'Reds')
# Number of courses per cluster, to check how balanced the clusters are.
df_courses['cluster_descr'].value_counts()
5     2318
10     875
0      853
4      777
8      566
2      362
1      305
3      283
14     235
13     207
11     177
6      160
7      155
12     149
9      128
Name: cluster_descr, dtype: int64

Relationship between the number of clusters and inertia for the description attribute

# Candidate numbers of clusters: k = 1..29.
kRange = range(1,30)
# Inertia for every k in kRange (helper defined elsewhere; presumably fits one
# k-means model per k -- confirm).
inertia_Kmean = get_inertia(X_descr, kRange)
plot_inertia(kRange, inertia_Kmean)
# Highlight k=8: index 7 because kRange starts at 1 (assumes the result is
# ordered like kRange).
plt.plot([8], [inertia_Kmean[7]], 'o--', color='dimgray', linewidth=3)
# Dashed straight guide line; its coordinates are hard-coded.
plt.plot([1,8,15], [8050, 7580,7110], '--', color='k', linewidth=1)
plt.annotate("Let's try k=8", xy=(8, inertia_Kmean[7]), xytext=(9,7800),
             size=14, weight='bold', color='dimgray',
             arrowprops=dict(facecolor='dimgray', shrink=0.05))
Text(9, 7800, "Let's try k=8")

K-Means with k=8 clusters

# Refit k-means on the description features with k=8; this replaces the k=15 model.
# NOTE(review): `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0;
# drop the argument when running on a newer release.
kmeans_descr = KMeans(n_clusters = 8, n_init = 10, n_jobs = -1, random_state=123456)
kmeans_descr.fit(X_descr)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=8, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=123456, tol=0.0001, verbose=0)
# Ten most common words for each of the 8 description clusters (helpers defined
# elsewhere in the notebook).
common_words=get_common_words(kmeans_descr, 10)
print_common_words(common_words, word_features_descr, vocab_frame_descr)
0 : managed, learning, used, team, business, understanding, works, skills, nbsp, helped
1 : data, analytics, visuals, tableau, used, analysis, bi, learning, power, statistics
2 : exam, questions, pmp, testing, certification, pass, practicing, project, managed, prepare
3 : nbsp, business, learning, get, used, managed, makes, helped, works, start
4 : get, online, makes, amazon, money, product, start, sell, website, creating
5 : sales, sell, customers, prospecting, closing, learning, business, skills, get, nbsp
6 : project, managed, agile, risk, pmi, certification, pmp, plan, nbsp, learning
7 : business, start, marketing, entrepreneur, success, plan, learning, get, ideas, helped
# Visualise the common words of the 8 description clusters (helper defined elsewhere).
squarify_words(common_words, word_features_descr, vocab_frame_descr)
# Overwrite the k=15 labels with the k=8 labels; later cells use this column.
df_courses['cluster_descr']=kmeans_descr.labels_
heatmap_categories_cluster('cluster_descr', df_courses, 'Reds')
plot_common_words(kmeans_descr, 5, word_features_descr, vocab_frame_descr, df_courses, 'cluster_descr')
# Print sample course titles for each cluster (the output shows 3 titles per cluster).
print_titles_cluster(3, df_courses, 'cluster_descr')
['secret-sauce-of-great-writing'
 'become-a-business-driven-enterprise-architect'
 'become-a-magento-developer-front-end-cert-subtitled']
['fearless-public-speaker-bootcamp-by-ricky-mendoza'
 'ab-testing-how-to-build-successful-product-experiments'
 'quickbooks-online-vs-quickbooks-desktop']
['learn-the-truth-about-agile-versus-waterfall'
 'agile-project-management-utilizing-microsoft-project-planning-tools'
 'painting-project-manager']
['learn_power_bi_for_free' 'sql-for-real-world-data-analysis'
 'tableau-for-beginners-free']
['how-to-build-an-online-business-a-complete-business-plan'
 'free-amazon-fba-course-for-beginners-2019-private-label'
 'start-a-business-in-5-days-with-shopify-and-productlistgenie']
['how-to-start-a-startup-business'
 '101-crucial-lessons-they-dont-teach-you-in-business-mba-school'
 'plan-your-business']
['salesleadgeneration' 'sales-rockstar-plan'
 'sales-scripts-for-appointment-setting-b2b-quick-start']
['get-your-pgmp-certification-on-your-first-attempt'
 'pmp-how-to-apply-for-pmp-course'
 'pmi-pfmp-and-pgmp-preparations-lessons-learned']

Hierarchical clustering

# Linkage matrix for the description feature matrix (helper defined elsewhere).
z_descr=get_linkage(X_descr )
# Plot the dendrogram of the description linkage (last 16 merges), with the
# reference line at the same distance that is used for the cut below.
plot_dendrogram(z_descr, 16, line_dist=9.6)
# Cut the dendrogram at distance 9.6 to obtain flat cluster labels.
df_courses['cluster_hier_descr']=fcluster(Z=z_descr, t=9.6, criterion='distance')
# Number of courses per hierarchical cluster.
df_courses['cluster_hier_descr'].value_counts()
5    3213
4    3060
2     521
1     416
3     340
Name: cluster_hier_descr, dtype: int64
# Heatmap for the hierarchical clustering of the description attribute; uses the
# 'cluster_hier_descr' column computed in this section.
heatmap_categories_cluster('cluster_hier_descr', df_courses, 'Reds' )

The distribution of the clusters produced by hierarchical clustering is very unbalanced with the description attribute as well. In the further analysis I will use the results of the k-means clustering with k=8.

PCA

# 2-D PCA plot of the description features with the k=8 k-means labels.
# NOTE(review): the meaning of the third argument (1000) is defined by the helper -- confirm.
plot_with_pca(X_descr, df_courses['cluster_descr'], 1000)
The explained variance through the first 2 principal comonent is 0.0558.

Export the clustering algorithm

# Persist the fitted k=8 model and read it back to verify the round trip.
filename ='kmeans8.sav'
# Context managers make sure the file handles are flushed and closed.
with open(filename, 'wb') as model_file:
    pickle.dump(kmeans_descr, model_file)
# The file was written just above, so unpickling it is safe here; never
# unpickle model files from untrusted sources.
with open(filename, 'rb') as model_file:
    model_kmeans = pickle.load(model_file)
model_kmeans
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=8, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=123456, tol=0.0001, verbose=0)
# Sanity check of the reloaded model: predict the cluster of every course and
# count how many predictions match the stored labels.
values=model_kmeans.predict(X_descr)
(values==df_courses['cluster_descr']).sum()
7550

There are clusters which are close to each other, e.g. cluster 3 is between clusters 0 and 8. Clusters 7, 4 and 1 are also adjacent. It is important to remember that I kept only 2 principal components, which explain only about 5.6% of the total variance (this is what is plotted on the graph). In contrast, the clusters are not reduced; they contain all the information.

Building the recommender system

For the recommender system I use the course features together with the result of the k-means clustering with k=8. I transformed the course dataset into a feature matrix by keeping only the relevant features (e.g. there is no need for the course id). For the categorical variables I introduced dummy variables. The clusters were also transformed into dummy variables, since the order of the clusters doesn't have any meaning (cluster 0 is not better or worse than cluster 1). As the last step of the preparation I normalized the feature matrix, since the features have different scales. I used the cosine similarity to compare the courses with each other.

There are 2 functions, which can be used to recommend courses:

  • Function recommend_for_user recommends courses for the user based on his/her previous courses. This function takes the user as input.
  • Function recommend_courses recommends courses based on another course_id. This function takes the course_id as input and looks for the courses that are similar to the original course.

Keeping the relevant features and prepare the dataframe

# Columns of df_courses that are used as features by the recommender.
rel_cols=['avg_rating',  'has_certificate',  'instructional_level', 'num_lectures','num_quizzes',
          'num_practice_tests','is_practice_test_course', 'num_article_assets', 'num_curriculum_items',
          'num_subscribers','num_reviews',  'price', 'primary_subcategory','cluster_descr']
# Work on an explicit copy so the assignments below never act on a slice of
# df_courses (avoids pandas chained-assignment problems).
df_rel=df_courses[rel_cols].copy()
df_rel['has_certificate']=df_rel['has_certificate'].astype(int)
# Cluster ids are labels, not quantities: cast to str so they are one-hot encoded.
df_rel['cluster_descr']=df_rel['cluster_descr'].astype(str)
cat_cols=['primary_subcategory', 'instructional_level', 'cluster_descr']
dummies=pd.get_dummies(df_rel[cat_cols], prefix=['subcat', 'level', 'cluster'])
# Replace the categorical columns by their dummy columns.
df_rel=pd.concat([df_rel.drop(columns=cat_cols), dummies], axis=1)
df_rel.head()
avg_rating has_certificate num_lectures num_quizzes num_practice_tests is_practice_test_course num_article_assets num_curriculum_items num_subscribers num_reviews ... level_Expert Level level_Intermediate Level cluster_0 cluster_1 cluster_2 cluster_3 cluster_4 cluster_5 cluster_6 cluster_7
0 4.53965 1 311 8 0 False 27 319 380584 73048 ... 0 0 0 0 0 1 0 0 0 0
1 4.50200 1 151 0 0 False 10 151 192581 37914 ... 0 0 0 0 0 1 0 0 0 0
2 4.48619 1 91 0 0 False 26 91 133919 28313 ... 0 0 1 0 0 0 0 0 0 0
3 4.62332 1 92 7 0 False 2 99 78429 24114 ... 0 0 0 0 0 1 0 0 0 0
4 4.41484 1 89 15 0 False 33 104 187207 23214 ... 0 0 0 0 1 0 0 0 0 0

5 rows × 39 columns

# Normalised feature matrix used for the similarity computation (helper defined
# elsewhere in the notebook).
df_norm=normalize_features(df_rel)
# Number of reviews per user name.
nr_user=df_reviews['user_name'].value_counts()
# Distinct review counts and how many users have each count; only used by the
# commented-out inspection line below.
unique, counts = np.unique(nr_user, return_counts=True)
#dict(zip(unique, counts))
#recommend_for_user(user_name)
# Ten users with the fewest reviews.
nr_user.sort_values()[:10]
Alina Shapoval              1
Crystal medin               1
Antony Jaramba              1
Kurian Just                 1
Nadine K-digitalMedia       1
Mithun Patlikadan           1
Abhilash Arvind Jangalve    1
Adam Haas                   1
Rini van Rijswijk           1
Pranay Chambhare            1
Name: user_name, dtype: int64
# Example: 5 recommendations for one reviewer, based on the course(s) they reviewed
# (helper defined elsewhere in the notebook).
recommend_for_user('DEEPAK IYER', 5, df_reviews, df_courses, df_norm)
The following courses are recommended after taking the course scala-and-spark-for-big-data-and-machine-learning with the id 977062:
                                     published_title  cosine_similarity
1  microsoft-power-bi-up-running-with-power-bi-de...           0.970756
2                                      mastertableau           0.970164
3                machine-learning-course-with-python           0.938753
4                                          mspowerbi           0.936494
5                                sascompletetutorial           0.935621

# Example with a reviewer of several courses: the output lists 5 recommendations
# per course plus a combined list ranked by average cosine similarity.
recommend_for_user('Henk Bergsma', 5,df_reviews, df_courses, df_norm)
The following courses are recommended after taking the course machinelearning with the id 950390:
                                     published_title  cosine_similarity
1  python-for-data-science-and-machine-learning-b...           0.996493
2                          the-complete-sql-bootcamp           0.984643
3                                      r-programming           0.974843
4                                          tableau10           0.958937
5                                        datascience           0.954673

The following courses are recommended after taking the course artificial-intelligence-az with the id 1219332:
                                     published_title  cosine_similarity
1                                       deeplearning           0.961594
2                                      python-coding           0.936468
3                          the-complete-sql-bootcamp           0.920872
4                                      r-programming           0.920834
5  an-entire-mba-in-1-courseaward-winning-busines...           0.917184

The following courses are recommended after all taken courses:
                                     published_title  avg_cos_sim
1                          the-complete-sql-bootcamp     0.952757
2  python-for-data-science-and-machine-learning-b...     0.951159
3                                      r-programming     0.947839
4                                        datascience     0.935391
5  an-entire-mba-in-1-courseaward-winning-busines...     0.934693